# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
librarian::shelf(
dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, geojsonio)
select <- dplyr::select # overwrite raster::select
# set random seed for reproducibility
set.seed(42)
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F)
American Black Bear
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Ursus americanus',
from = 'gbif', has_coords = T,
limit = 10000))
## Searched: gbif
## Occurrences - Found: 18,880, Returned: 10,000
## Search type: Scientific
## gbif: Ursus americanus (10000)
(res_large_limit <- spocc::occ(
query = 'Ursus americanus',
from = 'gbif', has_coords = T,
limit = 100000))
## Searched: gbif
## Occurrences - Found: 18,880, Returned: 18,880
## Search type: Scientific
## gbif: Ursus americanus (18880)
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
anyDuplicated(df)
## [1] 0
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
filter(basisOfRecord == "HUMAN_OBSERVATION") %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9992
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldImagery")
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
#env_datasets %>%
# select(dataset_code, description, citation) %>%
#DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
#DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio11", "WC_bio10", "WC_bio4", "ER_tri", "ER_topoWet", "WC_bio12", "WC_bio7")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
## Pseudo-Absence
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
#mapview(obs) +
mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 19984
datatable(pts_env, rownames = F)
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Linear Model
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19925
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26089 -0.35488 0.01379 0.32653 1.24786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.567e+00 1.100e-01 32.419 < 2e-16 ***
## WC_alt -2.003e-04 1.265e-05 -15.840 < 2e-16 ***
## WC_bio1 4.557e-02 1.010e-02 4.509 6.54e-06 ***
## WC_bio11 -8.347e-02 1.716e-02 -4.865 1.15e-06 ***
## WC_bio10 2.038e-04 2.004e-02 0.010 0.991887
## WC_bio4 -1.787e-02 4.454e-03 -4.011 6.07e-05 ***
## ER_tri -4.738e-04 1.366e-04 -3.467 0.000527 ***
## ER_topoWet -1.269e-01 3.548e-03 -35.780 < 2e-16 ***
## WC_bio12 1.527e-04 1.042e-05 14.652 < 2e-16 ***
## WC_bio7 1.143e-02 2.307e-03 4.953 7.36e-07 ***
## lon -1.623e-03 3.376e-04 -4.808 1.54e-06 ***
## lat -3.151e-02 1.907e-03 -16.525 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4208 on 19913 degrees of freedom
## Multiple R-squared: 0.2922, Adjusted R-squared: 0.2919
## F-statistic: 747.5 on 11 and 19913 DF, p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.4193317 1.2663354
range(y_true)
## [1] 0 1
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0030 -0.8504 -0.1263 0.8330 2.9749
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.610e+01 7.161e-01 22.482 < 2e-16 ***
## WC_alt -1.004e-03 7.991e-05 -12.561 < 2e-16 ***
## WC_bio1 4.182e-01 6.351e-02 6.585 4.53e-11 ***
## WC_bio11 -6.236e-01 1.011e-01 -6.166 7.03e-10 ***
## WC_bio10 6.768e-03 1.165e-01 0.058 0.95368
## WC_bio4 -1.221e-01 2.604e-02 -4.687 2.77e-06 ***
## ER_tri -3.156e-03 8.286e-04 -3.809 0.00014 ***
## ER_topoWet -6.820e-01 2.246e-02 -30.368 < 2e-16 ***
## WC_bio12 9.594e-04 6.974e-05 13.758 < 2e-16 ***
## WC_bio7 6.334e-02 1.351e-02 4.690 2.74e-06 ***
## lon -1.021e-02 2.000e-03 -5.102 3.35e-07 ***
## lat -1.654e-01 1.187e-02 -13.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27622 on 19924 degrees of freedom
## Residual deviance: 20858 on 19913 degrees of freedom
## AIC: 20882
##
## Number of Fisher Scoring iterations: 4
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.005246361 0.988990049
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")
librarian::shelf(mgcv)
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio11) + s(WC_bio10) + s(WC_bio4) + s(ER_tri) + s(ER_topoWet) + s(WC_bio12) + s(WC_bio7) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio11) + s(WC_bio10) +
## s(WC_bio4) + s(ER_tri) + s(ER_topoWet) + s(WC_bio12) + s(WC_bio7) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.39050 0.03172 -12.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.696 8.970 197.65 < 2e-16 ***
## s(WC_bio1) 8.573 8.914 373.07 < 2e-16 ***
## s(WC_bio11) 8.826 8.963 189.03 < 2e-16 ***
## s(WC_bio10) 6.091 7.002 209.28 < 2e-16 ***
## s(WC_bio4) 8.759 8.970 197.60 < 2e-16 ***
## s(ER_tri) 4.063 5.224 23.16 0.000363 ***
## s(ER_topoWet) 7.573 8.438 269.48 < 2e-16 ***
## s(WC_bio12) 8.625 8.932 418.41 < 2e-16 ***
## s(WC_bio7) 8.453 8.898 100.77 < 2e-16 ***
## s(lon) 8.577 8.948 215.44 < 2e-16 ***
## s(lat) 8.983 9.000 565.08 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.483 Deviance explained = 41.4%
## UBRE = -0.17887 Scale est. = 1 n = 19925
# show term plots
plot(mdl, scale=0)
### Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)? ### The variables with the highest Chi.sq values were latitude, WC_bio12 (annual precipitation), and WC_bio1 (annual mean temp) with Chi.sq values of 565.08, 418.41, and 373.07 respectively.
# load extra packages
librarian::shelf(
maptools, sf)
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
mdl <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
readr::write_rds(mdl, mdl_maxent_rds)
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)
response(mdl)
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# load packages
librarian::shelf(
caret, # m: modeling framework
dplyr, ggplot2 ,here, readr,
pdp, # X: partial dependence plots
ranger, # m: random forest modeling
rpart, # m: recursive partition modeling
rpart.plot, # m: recursive partition plotting
rsample, # d: split train/test data
skimr, # d: skim summarize data table
vip) # X: variable importance
# options
options(
scipen = 999,
readr.show_col_types = F)
set.seed(42)
# graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 19925 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 9970, 1: 9955 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 752.80 | 710.81 | -59.00 | 230.00 | 470.00 | 1137.00 | 3637.00 | ▇▂▂▁▁ |
| WC_bio1 | 0 | 1 | 6.39 | 7.24 | -12.90 | 1.40 | 6.10 | 11.30 | 24.40 | ▁▅▇▅▂ |
| WC_bio11 | 0 | 1 | -5.26 | 10.39 | -31.40 | -11.80 | -4.80 | 2.60 | 19.60 | ▂▅▇▆▂ |
| WC_bio10 | 0 | 1 | 17.52 | 5.24 | -0.40 | 13.70 | 17.00 | 21.00 | 35.50 | ▁▅▇▃▁ |
| WC_bio4 | 0 | 1 | 88.94 | 28.63 | 18.52 | 69.92 | 85.05 | 108.08 | 166.89 | ▂▇▇▅▂ |
| ER_tri | 0 | 1 | 43.25 | 48.13 | 0.00 | 7.01 | 23.14 | 67.44 | 306.15 | ▇▂▁▁▁ |
| ER_topoWet | 0 | 1 | 10.71 | 1.92 | 6.60 | 9.07 | 10.76 | 12.32 | 15.31 | ▅▇▇▇▂ |
| WC_bio12 | 0 | 1 | 835.96 | 485.86 | 47.00 | 466.00 | 737.00 | 1118.00 | 4175.00 | ▇▅▁▁▁ |
| WC_bio7 | 0 | 1 | 37.80 | 8.23 | 13.60 | 32.80 | 37.60 | 43.40 | 56.50 | ▁▂▇▆▃ |
| lon | 0 | 1 | -101.11 | 20.71 | -161.96 | -118.71 | -102.04 | -82.72 | -54.62 | ▁▅▇▇▃ |
| lat | 0 | 1 | 44.61 | 9.35 | 23.29 | 37.29 | 44.41 | 50.62 | 67.66 | ▂▆▇▅▂ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 9970 9955
table(d_train$present)
##
## 0 1
## 7976 7964
# run decision stump model
mdl <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15940
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15940 7964 0 (0.5003764 0.4996236)
## 2) ER_topoWet>=10.975 7445 2071 0 (0.7218267 0.2781733) *
## 3) ER_topoWet< 10.975 8495 2602 1 (0.3062978 0.6937022) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15940
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15940 7964 0 (0.50037641 0.49962359)
## 2) ER_topoWet>=10.975 7445 2071 0 (0.72182673 0.27817327)
## 4) WC_bio12< 710.5 3595 461 0 (0.87176634 0.12823366) *
## 5) WC_bio12>=710.5 3850 1610 0 (0.58181818 0.41818182)
## 10) WC_bio1< 0.55 368 20 0 (0.94565217 0.05434783) *
## 11) WC_bio1>=0.55 3482 1590 0 (0.54336588 0.45663412)
## 22) lon< -81.79019 2022 691 0 (0.65825915 0.34174085)
## 44) lat< 44.95915 1489 349 0 (0.76561451 0.23438549) *
## 45) lat>=44.95915 533 191 1 (0.35834897 0.64165103) *
## 23) lon>=-81.79019 1460 561 1 (0.38424658 0.61575342) *
## 3) ER_topoWet< 10.975 8495 2602 1 (0.30629782 0.69370218)
## 6) WC_bio11< -12.85 1038 174 0 (0.83236994 0.16763006) *
## 7) WC_bio11>=-12.85 7457 1738 1 (0.23306960 0.76693040)
## 14) WC_bio12< 390.5 666 213 0 (0.68018018 0.31981982) *
## 15) WC_bio12>=390.5 6791 1285 1 (0.18922103 0.81077897) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.41323456 0 1.0000000 1.0195881 0.007925122
## 2 0.08663988 1 0.5867654 0.5873933 0.007218758
## 3 0.03013561 2 0.5001256 0.5015068 0.006869733
## 4 0.01414699 3 0.4699900 0.4718734 0.006729182
## 5 0.01000000 7 0.4085886 0.4142391 0.006422535
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio11") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio11")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
## Random Forests
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.3242596
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
### Question: How might variable importance differ between rpart and RandomForest in your model outputs? # NEED ANSWER HERE
# load packages
librarian::shelf(
dismo, # species distribution modeling: maxent(), predict(), evaluate(),
dplyr, ggplot2, GGally, here, maptools, readr,
raster, readr, rsample, sf,
usdm) # uncertainty analysis for species distribution models: vifcor()
select = dplyr::select
# options
set.seed(42)
options(
scipen = 999,
readr.show_col_types = F)
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_geo <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)
# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)
# create training set with 80% of full data
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_alt 4.656657
## 2 WC_bio1 538.646564
## 3 WC_bio11 3391.138926
## 4 WC_bio10 1138.941779
## 5 WC_bio4 1448.864541
## 6 ER_tri 4.255321
## 7 ER_topoWet 4.520922
## 8 WC_bio12 2.936697
## 9 WC_bio7 41.529852
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## 4 variables from the 9 input variables have collinearity problem:
##
## WC_bio11 WC_bio4 WC_bio1 ER_topoWet
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( WC_bio10 ~ WC_alt ): -0.1181368
## max correlation ( WC_bio7 ~ WC_bio12 ): -0.6266193
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_alt 1.983334
## 2 WC_bio10 1.802469
## 3 ER_tri 2.249538
## 4 WC_bio12 2.506921
## 5 WC_bio7 3.114756
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)
# plot term plots
response(mdl_maxv)
# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 1992
## n absences : 1996
## AUC : 0.8565425
## cor : 0.6058625
## max TPR+TNR at : 0.6306873
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6306873
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
matrix(
c(tpr, fnr,
fpr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.8398594 0.2765531
## absent_obs 0.1601406 0.7234469
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")
plot(y_maxv > thr)